Variables:
Risk Age Sex Country
library(data.table)
library(tidyr)
library(maps)
library(haven)
library(ggplot2)
library(dplyr)
# ---- Wave 5: load, rename, subset, and attach country names ----
# Read the Wave 5 file and coerce the loaded object to a plain data.frame.
WV5_data <- readRDS("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds")
WV5_data_df <- as.data.frame(WV5_data)
# Preview the first five columns.
head(WV5_data_df[, 1:5])
library(dplyr)
# Give the survey columns used below readable names.
WV5_data <- rename(
  WV5_data_df,
  gender = V235,
  age = V237,
  country_code = V2,
  wave = V1,
  risktaking = V86
)
WV5_data
# Keep only the renamed columns, in this order.
WV5_data <- select(WV5_data, gender, age, country_code, wave, risktaking)
WV5_data
# Look up each numeric country code in the code/name table read from disk.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
WV5_data[["country"]] <- countrynames$name[
  match(WV5_data$country_code, countrynames$code)
]
# Number of respondents per country.
table(WV5_data$country)
Andorra Argentina Australia Brazil Bulgaria Burkina Faso Canada
1003 1002 1421 1500 1001 1534 2164
Chile China Colombia Cyprus (G) Egypt Ethiopia Finland
1000 1991 3025 1050 3051 1500 1014
France Georgia Germany Ghana Great Britain Guatemala Hong Kong
1001 1500 2064 1534 1041 1000 1252
Hungary India Indonesia Iran Iraq Italy Japan
1007 2001 2015 2667 2701 1012 1096
Jordan Malaysia Mali Mexico Moldova Morocco Netherlands
1200 1201 1534 1560 1046 1200 1050
New Zealand Norway Peru Poland Romania Russia Rwanda
954 1025 1500 1000 1776 2033 1507
Slovenia South Africa South Korea Spain Sweden Switzerland Taiwan
1037 2988 1200 1200 1003 1241 1227
Thailand Trinidad and Tobago Turkey Ukraine United States Uruguay Viet Nam
1534 1002 1346 1000 1249 1000 1495
Zambia
1500
# Display the Wave 5 data, which by this point also carries the 'country' column
WV5_data
NA
NA
# ---- Wave 6: load, rename, and subset ----
# load() restores the objects stored in the .rdata file into the workspace and
# returns only a character vector of their names, so its return value is not
# the data and is deliberately not assigned. The restored object is picked up
# by name on the next line.
load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
# Give the survey columns used below readable names.
WV6_data <- WV6_data %>%
  rename(
    wave = V1,
    gender = V240,
    age = V242,
    country_code = V2,
    risktaking = V76
  )
# Keep only the variables of interest.
WV6_data <- WV6_data %>%
  select(wave, gender, age, country_code, risktaking)
WV6_data
NA
# Decode the Wave 6 country codes into country names.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
WV6_data[["country"]] <- countrynames$name[
  match(WV6_data$country_code, countrynames$code)
]
# Number of respondents per country.
table(WV6_data$country)
Algeria Argentina Armenia Australia Azerbaijan Belarus Brazil
1200 1030 1100 1477 1002 1535 1486
Chile China Colombia Cyprus (G) Ecuador Egypt Estonia
1000 2300 1512 1000 1202 1523 1533
Georgia Germany Ghana Haiti Hong Kong India Iraq
1202 2046 1552 1996 1000 4078 1200
Japan Jordan Kazakhstan Kuwait Kyrgyzstan Lebanon Libya
2443 1200 1500 1303 1500 1200 2131
Malaysia Mexico Morocco Netherlands New Zealand Nigeria Pakistan
1300 2000 1200 1902 841 1759 1200
Palestine Peru Philippines Poland Qatar Romania Russia
1000 1210 1200 966 1060 1503 2500
Rwanda Singapore Slovenia South Africa South Korea Spain Sweden
1527 1972 1069 3531 1200 1189 1206
Taiwan Thailand Trinidad and Tobago Tunisia Turkey Ukraine United States
1238 1200 999 1205 1605 1500 2232
Uruguay Uzbekistan Yemen Zimbabwe
1000 1500 1000 1500
WV6_data
# Stack Wave 5 on top of Wave 6 into a single data set.
WV5_data
WV6_data
WVS_data <- do.call(rbind, list(WV5_data, WV6_data))
WVS_data
NA
NA
# Keep respondents whose risk item, gender, and age are all positive, then
# drop every row that still contains a missing value. The same two steps are
# applied to the pooled data and to each wave separately.
WVS_data <- na.omit(
  subset(WVS_data, risktaking > 0 & gender > 0 & age > 0)
)
data_Wave5 <- na.omit(
  subset(WV5_data, risktaking > 0 & gender > 0 & age > 0)
)
data_Wave6 <- na.omit(
  subset(WV6_data, risktaking > 0 & gender > 0 & age > 0)
)
# Transform the risk item so that high values represent more risk taking
# (6 - x + 1 is the same as 7 - x).
WVS_data$risktaking <- 7 - WVS_data$risktaking
# Transform the risk variable into a T-score (mean = 50, sd = 10).
# scale() returns a one-column matrix carrying centering/scaling attributes;
# as.numeric() strips those so the data frame stores an ordinary numeric
# column rather than a matrix column.
WVS_data$T_score_risktaking <- 10 * as.numeric(
  scale(WVS_data$risktaking, center = TRUE, scale = TRUE)
) + 50
WVS_data
# Transform the risk variable into a Z-score by undoing the T-score's
# shift (50) and stretch (10).
WVS_data$Z_score_risktaking <- (WVS_data$T_score_risktaking - 50) / 10
# Print the resulting data frame
print(WVS_data)
NA
# World map highlighting the countries present in the WVS data.
world_map <- map_data("world")
recorded_countries <- unique(WVS_data$country)
# The "world" map database uses its own region names, so WVS country labels
# that are spelled differently there would never match and would stay grey.
# Recode those labels to the map's spelling before matching.
wvs_to_map_region <- c(
  "United States" = "USA",
  "Great Britain" = "UK",
  "Viet Nam" = "Vietnam",
  "Cyprus (G)" = "Cyprus"
)
needs_recode <- recorded_countries %in% names(wvs_to_map_region)
recorded_regions <- recorded_countries
recorded_regions[needs_recode] <-
  wvs_to_map_region[recorded_countries[needs_recode]]
# The map draws Trinidad and Tobago as two separate regions.
if ("Trinidad and Tobago" %in% recorded_countries) {
  recorded_regions <- c(recorded_regions, "Trinidad", "Tobago")
}
# NOTE(review): Hong Kong does not appear to be a region of its own in this
# map database (presumably it is a subregion of China) -- confirm.
world_map$recorded <- ifelse(
  world_map$region %in% recorded_regions,
  "Recorded",
  "Not Recorded"
)
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
  geom_polygon(color = "white") +
  scale_fill_manual(
    values = c("Recorded" = "red", "Not Recorded" = "lightgrey"),
    guide = "none"
  ) +
  theme_void() +
  labs(title = "WVS", fill = "Status") +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
# dplyr provides count().
library(dplyr)
# One row per value of 'country' in WVS_data, with the number of rows
# (respondents) for that country in column 'n'.
country_counts <- count(WVS_data, country)
# Show the per-country counts.
print(country_counts)
NA
# Read the file of hardship indicators that were collected by hand from the
# CIA factbook, WHO, and World Bank (see Supplemental Materials for URL sources).
countryfacts <- read.csv(
  file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countryfacts_selection.csv",
  header = TRUE,
  as.is = TRUE
)
# Vector of ten column labels. It is only created here; nothing in this block
# applies it to 'countryfacts'.
labels <- c(
  "code", "country", "codeWVS", "Homicide", "GDP",
  "InfMort", "LifeExp", "GINI", "GenderPEdu", "code2"
)
# Show the table as read from disk.
print(countryfacts)
# dplyr supplies %>% and mutate(). library() stops with an error when the
# package is missing, instead of installing packages as a side effect of
# running the analysis.
library(dplyr)
# Create the 'hardship' column: the unweighted mean of the six indicators.
# Because the indicators are added with `+`, a country with any missing
# indicator gets NA for 'hardship'.
# NOTE(review): the indicators are averaged exactly as stored in the file;
# this presumably assumes they are already on a comparable scale and point in
# the same direction -- confirm against the data preparation.
countryfacts <- countryfacts %>%
  mutate(
    hardship = (homiciderate + gdp + infantmortality + lifeexpectancy +
      gini + femalemale_primedu) / 6
  )
countryfacts
# Summarise the 'hardship' column within each 'label' group.
# min() and max() with na.rm = TRUE emit a warning and return Inf / -Inf when
# a group has no non-missing hardship value, so such groups are given NA
# explicitly instead. 'n' is the number of non-missing hardship values.
hardship_index_distribution <- countryfacts %>%
  group_by(label) %>%
  summarize(
    mean = mean(hardship, na.rm = TRUE),
    median = median(hardship, na.rm = TRUE),
    sd = sd(hardship, na.rm = TRUE),
    min = if (all(is.na(hardship))) NA_real_ else min(hardship, na.rm = TRUE),
    max = if (all(is.na(hardship))) NA_real_ else max(hardship, na.rm = TRUE),
    n = sum(!is.na(hardship))
  )
Warning: There were 24 warnings in `summarize()`.
The first warning was:
ℹ In argument: `min = min(hardship, na.rm = TRUE)`.
ℹ In group 2: `label = "Andorra"`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run dplyr::last_dplyr_warnings() to see the 23 remaining warnings.
# Print the per-label summary of the hardship column
print(hardship_index_distribution)
# Per-country summary table: sample size, percentage of women, mean age,
# age range, and mean standardized risk taking.
library(dplyr)
table_data_WVS <- WVS_data %>%
  group_by(country) %>%
  summarise(
    n = n(),
    # Gender is treated as 1 = male, 2 = female, the same order in which this
    # script labels factor(gender) as "Male", "Female" when plotting; the
    # share of women is therefore the share of rows with gender == 2.
    # NOTE(review): confirm the coding against the WVS codebook.
    female_percentage = mean(gender == 2) * 100,
    mean_age = mean(age, na.rm = TRUE),
    age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
    mean_risktaking = mean(Z_score_risktaking, na.rm = TRUE)
  )
table_data_WVS
# Scatter plot across all countries: standardized age on the x axis,
# standardized risk taking on the y axis, coloured by gender, with one
# fitted straight line per gender.
ggplot(
  data = WVS_data,
  mapping = aes(
    x = scale(age),
    y = Z_score_risktaking,
    colour = factor(gender)
  )
) +
  geom_point(size = 0.1) +
  geom_smooth(method = "lm") +
  scale_colour_manual(
    values = c("blue", "red"),
    labels = c("Male", "Female")
  ) +
  labs(colour = "Gender")
WVS_data
NA
# Per-country regression table: one linear model per country of standardized
# risk taking on age (standardized within that country's rows) and gender.
# The first column of the coefficient table holds the estimates; its rows are
# the intercept, the age term, and the gender term, in that order.
regression_results_WVS <- WVS_data %>%
  group_by(country) %>%
  do(model = lm(Z_score_risktaking ~ scale(age) + gender, data = .)) %>%
  summarize(
    country = first(country),
    intercept = summary(model)$coefficients[1, 1],
    slope_age = summary(model)$coefficients[2, 1],
    slope_gender = summary(model)$coefficients[3, 1]
  )
regression_results_WVS
NA
NA
# ---- GPS data: load, clean, subset, and standardize age ----
gps_data <- haven::read_dta("/Users/cristinacandido/Documents/Github/risk_wvs/data/individual_new.dta")
gps_data
# Clean the data by removing records with missing values in the key columns
gps_data <- gps_data %>%
  drop_na(country, isocode, risktaking, gender, age)
# Display the cleaned data
gps_data
# Select only the variables of interest
gps_data <- gps_data %>%
  select(country, isocode, ison, risktaking, gender, age)
gps_data
# Z-score for age, computed within each country.
# scale() returns a one-column matrix, so as.numeric() is used to store a
# plain numeric column. ungroup() drops the grouping afterwards so that later
# steps do not silently inherit it; steps that need per-country grouping
# call group_by() themselves.
gps_data <- gps_data %>%
  group_by(country) %>%
  mutate(z_score_age = as.numeric(scale(age))) %>%
  ungroup()
# Display the new column with Z-scores per country
gps_data
# Per-country table of intercept and slopes for the GPS data: one linear model
# per country of risk taking on the standardized age column and gender.
# Estimates are read from the first column of each model's coefficient table.
regression_results_gps <- gps_data %>%
  group_by(country) %>%
  do(model = lm(risktaking ~ z_score_age + gender, data = .)) %>%
  summarize(
    country = first(country),
    intercept = summary(model)$coefficients[1, 1],
    slope_age = summary(model)$coefficients[2, 1],
    slope_gender = summary(model)$coefficients[3, 1]
  )
regression_results_gps
# Country names that occur in both the WVS and the GPS data.
common_countries <- intersect(WVS_data$country, gps_data$country)
# Hand-written list of countries to keep.
selected_countries <- c(
  "Argentina", "Australia", "Brazil", "Canada", "Chile", "China",
  "Egypt", "Finland", "France", "Georgia", "Germany", "Ghana",
  "Hungary", "India", "Indonesia", "Iran", "Japan", "Jordan",
  "Mexico", "Moldova", "Morocco", "Netherlands", "Peru", "Poland",
  "Romania", "Russia", "Rwanda", "South Africa", "South Korea",
  "Spain", "Sweden", "Switzerland", "Thailand", "Turkey", "Ukraine",
  "United States", "Algeria", "Colombia", "Estonia", "Haiti", "Iraq",
  "Kazakhstan", "Nigeria", "Pakistan", "Philippines", "Zimbabwe"
)
# Restrict the WVS data to rows whose country is on that list.
new_WVS <- subset(WVS_data, country %in% selected_countries)
# Show the restricted data set.
new_WVS
NA
NA
# Hand-written list of countries to keep.
selected_countries <- c(
  "Argentina", "Australia", "Brazil", "Canada", "Chile", "China",
  "Egypt", "Finland", "France", "Georgia", "Germany", "Ghana",
  "Hungary", "India", "Indonesia", "Iran", "Japan", "Jordan",
  "Mexico", "Moldova", "Morocco", "Netherlands", "Peru", "Poland",
  "Romania", "Russia", "Rwanda", "South Africa", "South Korea",
  "Spain", "Sweden", "Switzerland", "Thailand", "Turkey", "Ukraine",
  "United States", "Algeria", "Colombia", "Estonia", "Haiti", "Iraq",
  "Kazakhstan", "Nigeria", "Pakistan", "Philippines", "Zimbabwe"
)
# Restrict the GPS data to rows whose country is on that list.
new_gps <- gps_data[is.element(gps_data$country, selected_countries), ]
# Show the restricted data set.
new_gps
NA
# Per-country regressions on the restricted WVS data: standardized risk taking
# on age (standardized within each country's rows) and gender. Columns carry a
# _WVS suffix so they stay distinguishable after merging with the GPS results.
regression_results_WVS_new <- new_WVS %>%
  group_by(country) %>%
  do(model = lm(Z_score_risktaking ~ scale(age) + gender, data = .)) %>%
  summarize(
    country = first(country),
    intercept_WVS = summary(model)$coefficients[1, 1],
    slope_age_WVS = summary(model)$coefficients[2, 1],
    slope_gender_WVS = summary(model)$coefficients[3, 1]
  )
regression_results_WVS_new
NA
# Per-country regressions on the restricted GPS data: risk taking on age
# (standardized within each country's rows) and gender. Columns carry a _gps
# suffix so they stay distinguishable after merging with the WVS results.
regression_results_gps_new <- new_gps %>%
  group_by(country) %>%
  do(model = lm(risktaking ~ scale(age) + gender, data = .)) %>%
  summarize(
    country = first(country),
    intercept_gps = summary(model)$coefficients[1, 1],
    slope_age_gps = summary(model)$coefficients[2, 1],
    slope_gender_gps = summary(model)$coefficients[3, 1]
  )
regression_results_gps_new
NA
regression_results_gps_new
regression_results_WVS_new
# Join the two coefficient tables on their shared "country" column, keeping
# countries that appear in only one of them (all = TRUE).
merged_results <- merge(
  x = regression_results_gps_new,
  y = regression_results_WVS_new,
  by = "country",
  all = TRUE
)
merged_results
NA